Estimated change in pneumonia in Latin American countries, subnational analyses

Daniel Weinberger

2019-08-08


Installing the package

Uncomment and run the following 3 lines of code to install the package. You will be prompted to manually select a number to indicate whether to update dependencies and which ones.

#install.packages('devtools') 
#library(devtools) 
#devtools::install_github('https://github.com/weinbergerlab/InterventionEvaluatR') 

Analysis goal

The goal for this analysis is to quantify changes in the rates of hospitalization due to pneumonia in different regions of Latin American countries


View dataset

Let’s load the data, which are included with the package

  ds<-read.csv('./prelogNEW_Mexico_state.csv')
  ds<-ds[substr(ds$age_group,1,1) =='9'  ,]
  head(ds)
#>       age_group       date J12_18 A10_B99_nopneumo A17 A18 A19 A39 A41
#> 37633       9_1 2000-01-01     20               NA  NA  NA  NA  NA  NA
#> 37634       9_1 2000-02-01     13               NA  NA  NA  NA  NA  NA
#> 37635       9_1 2000-03-01     15               NA  NA  NA  NA  NA  NA
#> 37636       9_1 2000-04-01     10               NA  NA  NA  NA  NA  NA
#> 37637       9_1 2000-05-01      7               NA  NA  NA  NA  NA  NA
#> 37638       9_1 2000-06-01      2               NA  NA  NA  NA  NA  NA
#>       B20_24 B34 B96 B97 B99 C00_D48 D50_89 E00_99 E10_14 E40_46 G00_99_SY
#> 37633     NA  NA  NA  NA  NA      NA     NA     NA     NA     NA        NA
#> 37634     NA  NA  NA  NA  NA      NA     NA     NA     NA     NA        NA
#> 37635     NA  NA  NA  NA  NA      NA     NA     NA     NA     NA        NA
#> 37636     NA  NA  NA  NA  NA      NA     NA     NA     NA     NA        NA
#> 37637     NA  NA  NA  NA  NA      NA     NA     NA     NA     NA        NA
#> 37638     NA  NA  NA  NA  NA      NA     NA     NA     NA     NA        NA
#>       H00_99_SY I00_99 I60_64 cJ20_J22 K00_99 K35 K80 L00_99 M00_99 N00_99
#> 37633        NA     NA     NA       13     NA  NA  NA     NA     NA     NA
#> 37634        NA     NA     NA       17     NA  NA  NA     NA     NA     NA
#> 37635        NA     NA     NA        5     NA  NA  NA     NA     NA     NA
#> 37636        NA     NA     NA        6     NA  NA  NA     NA     NA     NA
#> 37637        NA     NA     NA        7     NA  NA  NA     NA     NA     NA
#> 37638        NA     NA     NA        5     NA  NA  NA     NA     NA     NA
#>       N39 P00_99 P05_07 Q00_99 S00_T99 U00_99 V00_Y99 Z00_99 ach_noj
#> 37633  NA     90     26     12      NA     NA      NA     NA     129
#> 37634  NA     80     18      5      NA     NA      NA     NA     113
#> 37635  NA     98     16      4      NA     NA      NA     NA     133
#> 37636  NA    106     26     12      NA     NA      NA     NA     146
#> 37637  NA     87     18      6      NA     NA      NA     NA     129
#> 37638  NA     85     27      4      NA     NA      NA     NA     116
  #ds$G00_99_SY <-NULL #Exclude meningitis as a control 

Ensure your date variable is in an R date format. If your variable is in a character or factor format, you need to tell R the format. – %m is a 2 digit month; %b is a month abbreviation (ie Jan, Feb) – %d is a 2 digit day (0-31), – %Y is a 4 digit year (e.g. 2011), %y is a 2 digit year (e.g. 11).
These codes are separated by a dash or slash or space. Modify the tryFormats script below if needed to match the format in your dataset

ds$date<-as.Date(ds$date, tryFormats=c('%Y-%m-%d',
                                                    '%m-%d-%Y',
                                                    '%m/%d/%Y',
                                                    '%Y/%m/%d',
                                                    '%d/%m/%Y'
                                                    ) )

Sort data by age group and month

    ds<-ds[order(ds$age_group, ds$date),] #Sort data by age group and month

Set parameters for analysis

Here we need to set a few parameters. We Use the evaluatr.init() function to specify the name of the dataset, the date at which the vaccine is introduced, the date at which we want to begin evaluating the vaccine (typically 1-2 year after vaccine introduction). We also provide some information on the dataset, sch as whether the data are monthly or quarterly (n_seasons), the variable names for the grouping variable, the date variable, the outcome variable, and the denominator variable (if any).


analysis <- evaluatr.init(
  country = "Mexico_state", data = ds,
  post_period_start = "2008-02-01", #First 'post-intervention' month is Jan 2012
  eval_period_start = "2009-02-01", #We ignore first 12 month of data to allow for vaccine ramp up
  eval_period_end = "2013-12-01", #The evaluation period lasts 2 years
  n_seasons = 12, #This is monthly data, so select 12
  year_def = "epi_year", # we are in southern hemisphere, so aggregate results by calendar year (Jan-Dec)
  group_name = "age_group",  #Strata categry name
  date_name = "date", #Date variable name
  outcome_name = "J12_18", #Outcome variable name
  denom_name = "ach_noj" ,#Denominator variable name
    set.burnN=10000,
  set.sampleN=5000
)
set.seed(1)

Run the main analysis

Save the results in object ‘impact_results’

n.iter<-ncol(impact_results$full$rr_iter)
thin=1
trace1<-impact_results$full$rr_iter[,seq(from=1, to=n.iter, by=thin)] #thin
for(i in 1: nrow(trace1)){
  geweke.p<- pnorm(abs(geweke.diag(mcmc(trace1[i,]))$z),lower.tail=FALSE)*2
  print(geweke.p)
if(geweke.p>0.05){con.stat<-'Model converged'
  }else{
    con.stat<-'Not converged'
  } 
   plot(trace1[i,], type='l', main=paste0(con.stat,' ',colnames(trace1)[i]), bty='l', ylim=c(0.2,2))
}
#>     var1 
#> 0.362741

#>      var1 
#> 0.2256035

#>     var1 
#> 0.532991

#>        var1 
#> 0.004133511

#>      var1 
#> 0.1015424

#>      var1 
#> 0.7423961

#>        var1 
#> 0.004454227

#>      var1 
#> 0.7486577

#>      var1 
#> 0.2652687

#>      var1 
#> 0.8571663

#>      var1 
#> 0.3705752

#>      var1 
#> 0.2969788

#>      var1 
#> 0.6601505

#>      var1 
#> 0.9622489

#>      var1 
#> 0.3925018

#>       var1 
#> 0.04765536

#>      var1 
#> 0.6839096

#>      var1 
#> 0.4085838

#>      var1 
#> 0.3999231

#>      var1 
#> 0.5864752

#>     var1 
#> 0.925462

#>      var1 
#> 0.9224339

#>      var1 
#> 0.8633828

#>      var1 
#> 0.5127307

#>      var1 
#> 0.9975958

#>       var1 
#> 0.02664334

#>     var1 
#> 0.182912

#>       var1 
#> 0.01926025

#>      var1 
#> 0.3963716

Results

Compare estimates from different models

This shows the estimated rate ratio and 95% credible intervals from a synthetic controls analysis; a time-trend analysis where we used the specified denominator (all non-respiratory deaths) to adjust the number of pneumonia deaths in each month and a linear trend for time; a classic interrupted time series analysis (segmented regression); and the STL+PCA approach, which smooths and combines the control variables prior to including them in the model.

results.table<- cbind.data.frame(
  #impact_results$best$rr_mean_intervals, 
  impact_results$full$rr_mean_intervals, 
  impact_results$time$rr_mean_intervals, 
  #impact_results$time_no_offset$rr_mean_intervals, 
  impact_results$its$rr_mean_intervals, 
  impact_results$pca$rr_mean_intervals)

  table<-xtable(results.table)
  htmlTable(table)
Synthetic controls Estimate (95% CI) Time trend Estimate (95% CI) Classic ITS (95% CI) STL+PCA Estimate (95% CI)
9_1 0.87 (0.61, 1.15) 0.58 (0.37, 0.89) 0.63 (0.45, 0.89) 0.62 (0.42, 0.87)
9_10 0.91 (0.71, 1.16) 0.82 (0.56, 1.21) 1.05 (0.77, 1.46) 1.18 (0.77, 1.87)
9_11 1 (0.85, 1.2) 1.24 (0.97, 1.58) 1.2 (0.96, 1.49) 1.03 (0.82, 1.26)
9_12 0.81 (0.7, 0.99) 0.84 (0.6, 1.17) 0.89 (0.64, 1.22) 0.76 (0.48, 1.15)
9_13 0.84 (0.58, 1.22) 0.79 (0.52, 1.19) 0.82 (0.57, 1.17) 0.75 (0.5, 1.12)
9_14 1.22 (1.02, 1.47) 0.95 (0.73, 1.25) 0.89 (0.67, 1.19) 0.93 (0.75, 1.14)
9_15 0.91 (0.78, 1.04) 1.09 (0.81, 1.5) 1.11 (0.85, 1.44) 0.65 (0.5, 0.83)
9_16 0.84 (0.7, 1.04) 0.8 (0.59, 1.06) 0.79 (0.61, 1.04) 0.63 (0.49, 0.82)
9_17 0.76 (0.64, 0.93) 1.07 (0.7, 1.56) 1.11 (0.77, 1.59) 0.6 (0.39, 0.85)
9_19 0.65 (0.56, 0.83) 1.06 (0.79, 1.4) 0.9 (0.66, 1.23) 0.57 (0.33, 0.96)
9_2 0.93 (0.73, 1.15) 1.04 (0.73, 1.45) 1.01 (0.77, 1.33) 0.92 (0.7, 1.2)
9_20 1 (0.87, 1.15) 1.1 (0.78, 1.59) 1.08 (0.8, 1.46) 0.83 (0.61, 1.12)
9_21 0.93 (0.8, 1.05) 0.89 (0.65, 1.2) 0.79 (0.59, 1.07) 0.73 (0.6, 0.89)
9_22 0.85 (0.67, 1.17) 1.22 (0.89, 1.71) 1.27 (0.92, 1.72) 1.3 (0.95, 1.82)
9_23 1.1 (0.86, 1.36) 0.96 (0.61, 1.44) 0.94 (0.65, 1.35) 1.01 (0.79, 1.29)
9_24 0.74 (0.6, 0.89) 0.77 (0.58, 1.01) 0.73 (0.57, 0.94) 0.68 (0.54, 0.86)
9_25 0.82 (0.66, 1.06) 1 (0.67, 1.47) 1.02 (0.75, 1.37) 0.76 (0.56, 1.05)
9_26 0.66 (0.41, 0.78) 0.63 (0.44, 0.88) 0.85 (0.64, 1.13) 1.05 (0.46, 2.53)
9_27 1.13 (1, 1.28) 1.07 (0.76, 1.51) 0.75 (0.54, 1.04) 0.94 (0.77, 1.12)
9_28 0.79 (0.6, 1) 0.92 (0.68, 1.23) 0.91 (0.71, 1.17) 0.97 (0.73, 1.28)
9_29 0.97 (0.64, 1.38) 0.99 (0.64, 1.51) 1.09 (0.74, 1.59) 0.75 (0.36, 1.6)
9_30 0.85 (0.76, 0.97) 0.96 (0.68, 1.34) 0.82 (0.6, 1.11) 0.82 (0.64, 1.03)
9_31 1.07 (0.69, 1.38) 1.24 (0.8, 1.84) 1.27 (0.86, 1.85) 0.97 (0.72, 1.3)
9_32 0.82 (0.66, 0.99) 0.61 (0.37, 1) 0.78 (0.55, 1.11) 0.78 (0.61, 0.98)
9_4 0.96 (0.8, 1.14) 1.19 (0.74, 1.87) 1.26 (0.85, 1.87) 0.95 (0.77, 1.16)
9_5 0.76 (0.51, 1.06) 0.65 (0.44, 0.92) 0.69 (0.49, 0.96) 0.56 (0.38, 0.85)
9_7 0.96 (0.81, 1.1) 1.09 (0.83, 1.42) 1.2 (0.93, 1.55) 0.85 (0.69, 1.02)
9_8 1.14 (0.97, 1.32) 1.32 (0.98, 1.8) 0.98 (0.7, 1.37) 0.97 (0.81, 1.14)
9_9 1.1 (0.98, 1.22) 1.08 (0.82, 1.43) 1.07 (0.85, 1.35) 1.08 (0.94, 1.23)

Cases averted

How many cases were prevented from the time of vaccine introduction to the last time point in each stratum (+/- 95% CrI)? You can modify the number ‘last.point’ to pull out the cumulative number of cases at any point in the time series

last.point<-dim(impact_results$best$cumsum_prevented)[1]
cum.prevented<-impact_results$best$cumsum_prevented[last.point,,]

Number of variables selected in SC analysis

The Synthetic controls analysis uses a Bayesian variable selection algorithm to weight the candidate covariates. In each MCMC iteration, it tests a different combination of variables. The model size indicates how many variables are selected in any given model. If <1 variable is selected on average, this indicates that no suitable control variables were identified. In this example 1-2 variables were selected on average in the 2-23m and 2-59 m age categories, while no controls were identified in the 24-59m age group (the average model size is <1 (0.44)).

model_size = data.frame(t(analysis$model_size))
htmlTable(model_size, align='c')
X9_1 X9_10 X9_11 X9_12 X9_13 X9_14 X9_15 X9_16 X9_17 X9_19 X9_2 X9_20 X9_21 X9_22 X9_23 X9_24 X9_25 X9_26 X9_27 X9_28 X9_29 X9_30 X9_31 X9_32 X9_4 X9_5 X9_7 X9_8 X9_9
1 2.13 3.18 1.88 1.51 2.89 3.61 4.19 2.34 1.7 2.11 1.73 1.4 1.54 1.38 0.69 2.05 2.24 1.69 2.15 2.21 2.51 1.49 0.95 1.54 1.7 2.58 1.93 2.18 3.32

Inclusion Probabilities

Here we can see which variables received the most weight in the models for each strata. Values range from 0-1, with values closer to 1 indicating that the variable was included in a larger proportion of the variable combinations that were tested A value of 1 would indicate that the variable was always included in the model, a value of 0.48 would indicate the variable was included in 48% of the models that were tested.

htmlTable(incl_probs)
Group Greatest Inclusion Variable Greatest Inclusion Probability Second Greatest Inclusion Variable Second Greatest Inclusion Probability Third Greatest Inclusion Variable Third Greatest Inclusion Probability
1 9_1 cJ20_J22 1 P05_07 0.37 ach_noj 0.28
2 9_10 cJ20_J22 0.99 ach_noj 0.89 P00_99 0.88
3 9_11 cJ20_J22 1 P00_99 0.31 ach_noj 0.27
4 9_12 cJ20_J22 1 P05_07 0.22 P00_99 0.11
5 9_13 ach_noj 0.89 P05_07 0.74 P00_99 0.71
6 9_14 cJ20_J22 1 ach_noj 0.7 E00_99 0.69
7 9_15 cJ20_J22 1 E00_99 0.86 P05_07 0.55
8 9_16 cJ20_J22 1 P05_07 0.63 P00_99 0.28
9 9_17 cJ20_J22 1 ach_noj 0.22 P05_07 0.16
10 9_19 cJ20_J22 1 P05_07 0.56 P00_99 0.25
11 9_2 Q00_99 0.8 ach_noj 0.37 P00_99 0.34
12 9_20 cJ20_J22 1 P00_99 0.12 ach_noj 0.11
13 9_21 cJ20_J22 1 ach_noj 0.16 P00_99 0.16
14 9_22 P00_99 0.54 ach_noj 0.49 Q00_99 0.18
15 9_23 P00_99 0.26 ach_noj 0.26 P05_07 0.17
16 9_24 cJ20_J22 1 P05_07 0.31 P00_99 0.26
17 9_25 cJ20_J22 1 ach_noj 0.38 Q00_99 0.35
18 9_26 cJ20_J22 1 P00_99 0.31 ach_noj 0.14
19 9_27 cJ20_J22 1 P05_07 0.84 P00_99 0.13
20 9_28 cJ20_J22 1 P00_99 0.7 ach_noj 0.15
21 9_29 cJ20_J22 1 ach_noj 0.8 P00_99 0.29
22 9_30 cJ20_J22 1 K00_99 0.18 P05_07 0.08
23 9_31 ach_noj 0.26 P00_99 0.23 Z00_99 0.18
24 9_32 cJ20_J22 1 Q00_99 0.2 ach_noj 0.18
25 9_4 cJ20_J22 1 P00_99 0.27 ach_noj 0.22
26 9_5 cJ20_J22 1 ach_noj 0.8 P00_99 0.44
27 9_7 cJ20_J22 1 P00_99 0.27 ach_noj 0.19
28 9_8 cJ20_J22 1 Q00_99 0.34 K00_99 0.26
29 9_9 cJ20_J22 1 A10_B99_nopneumo 0.85 Z00_99 0.77

Generate and save the plots

Plot the results for 1 age group

First look at the results from the synthetic controls model for 1 age group.

This first plot shows a monthly time series, with observed, fitted, and counterfacual values. The observed number of deaths is shown in the black line. The fitted values for the pre-vaccine period are shown in the red dotted line, and the counterfactual estimate with its 95% credible interval is shown as the white dotted line and gray shaded area. if the black line is below the gray shaded area, this would indicate that obsrved cases are lower than expected based on changes in the control diseases in the post-vaccine period. If the controls appropriately adjust for underlying trends, then this would reflect an effect of the vaccine.

      plots$groups[[1]]$pred_full 

It is sometimes easier to look at the results if we aggregate the observed and expected values up to the annual time scale. Here the observed values are shown as black dots. When the black dots go below the gray shaded area, this indicates that the observed cases are lower than expected based on changes in the control diseases in the post-vaccine period. If the controls appropriately adjust for underlying trends, then this would reflect an effect of the vaccine. The vertical line indicates the date of vaccine introduction. The year when the vaccine is introduced is included to the right of the line, even if the vaccine was introduced part-way through the year. For instance, regardless of whether the the vaccine was introduced in January 2009 or November 2009, the observed dot for 2009 would be to the right of the line.

      plots$groups[[1]]$pred_full_agg 

Finally, we can look at the cumulative cases prevented. In this example, there have been 445 cases prevented (95%CrI: 58, 931) from the time of vaccine introduction to the last day month of the study period. This is calculated by takin the difference between the observed and fitted number of cases in each month, and summing them. If atleast 1 control disease is identified from the synthetic controls model, then the result here is drawn from that model, otherwise, it is drawn from the STL+PCA model.

      plots$groups[[1]]$cumsum_prevented 

Printing plots for all models and age groups

We instead might want to just print everything for all age groups and models. We can use the following code to do that

Plot Observed vs expected yearly time series

for (group in names(plots$groups)) {
      par(mfrow=c(4,1))
      print(plots$groups[[group]]$pred_full_agg )
      print(plots$groups[[group]]$pred_best_agg )
      print(plots$groups[[group]]$pred_time_agg )
      print(plots$groups[[group]]$pred_pca_agg )
}

#> Warning: Removed 1 rows containing missing values (geom_point).

#> Warning: Removed 1 rows containing missing values (geom_point).

#> Warning: Removed 2 rows containing missing values (geom_point).

#> Warning: Removed 2 rows containing missing values (geom_point).

Use HDI intervals instead

The package calcuates both an equal-tail 95% credible interval and a 95% highest posterior density interval (HPD or HDI interval). The latter might be more appropriate when the posterior distribution is skewed. The objects with HDI intervals are saved with an appendix of ‘HDI’ so –impact_results\(best\)pred_quantiles gives the estimate for equal tail intervals while –impact_results\(best\)pred_quantiles_HDI gives the estimate for the HDI interval –impact_results\(best\)cumsum_prevented_hdi gives the estimate of cumulative cases prevented using HPI –impact_results\(best\)ann_pred_HDI gives the annual predicted cases with HDI intervals –impact_results\(best\)rr_mean_HDI gives the overall rate ratio caclulated with HDI intervals – impact_results\(best\)log_rr_hdi gives the pointswise log-RR with HDI intervals Here we compare the version calculated with equal-tailed intervals with HPD intervals. In this example they give very similar coverage

print("Equal tailed intervals")
#> [1] "Equal tailed intervals"
round(impact_results$best$rr_mean[,c(2,1,3)],2)
#>      Point Estimate Lower CI Upper CI
#> 9_1            0.87     0.61     1.15
#> 9_10           0.91     0.71     1.16
#> 9_11           1.00     0.85     1.20
#> 9_12           0.81     0.70     0.99
#> 9_13           0.84     0.58     1.22
#> 9_14           1.22     1.02     1.47
#> 9_15           0.91     0.78     1.04
#> 9_16           0.84     0.70     1.04
#> 9_17           0.76     0.64     0.93
#> 9_19           0.65     0.56     0.83
#> 9_2            0.93     0.73     1.15
#> 9_20           1.00     0.87     1.15
#> 9_21           0.93     0.80     1.05
#> 9_22           0.85     0.67     1.17
#> 9_23           1.01     0.79     1.29
#> 9_24           0.74     0.60     0.89
#> 9_25           0.82     0.66     1.06
#> 9_26           0.66     0.41     0.78
#> 9_27           1.13     1.00     1.28
#> 9_28           0.79     0.60     1.00
#> 9_29           0.97     0.64     1.38
#> 9_30           0.85     0.76     0.97
#> 9_31           0.97     0.72     1.30
#> 9_32           0.82     0.66     0.99
#> 9_4            0.96     0.80     1.14
#> 9_5            0.76     0.51     1.06
#> 9_7            0.96     0.81     1.10
#> 9_8            1.14     0.97     1.32
#> 9_9            1.10     0.98     1.22

print("HPD intervals")
#> [1] "HPD intervals"
round(impact_results$best$rr_mean_hdi,2)
#>      Point Estimate lower upper
#> 9_1            0.87  0.61  1.14
#> 9_10           0.91  0.70  1.14
#> 9_11           1.00  0.85  1.19
#> 9_12           0.81  0.69  0.97
#> 9_13           0.84  0.55  1.18
#> 9_14           1.22  1.01  1.45
#> 9_15           0.91  0.77  1.03
#> 9_16           0.84  0.70  1.04
#> 9_17           0.76  0.62  0.92
#> 9_19           0.65  0.55  0.81
#> 9_2            0.93  0.73  1.15
#> 9_20           1.00  0.87  1.15
#> 9_21           0.93  0.81  1.06
#> 9_22           0.85  0.66  1.15
#> 9_23           1.01  0.79  1.28
#> 9_24           0.74  0.60  0.89
#> 9_25           0.82  0.64  1.04
#> 9_26           0.66  0.41  0.78
#> 9_27           1.13  0.99  1.26
#> 9_28           0.79  0.59  0.98
#> 9_29           0.97  0.64  1.38
#> 9_30           0.85  0.75  0.96
#> 9_31           0.97  0.70  1.27
#> 9_32           0.82  0.66  0.98
#> 9_4            0.96  0.80  1.14
#> 9_5            0.76  0.49  1.04
#> 9_7            0.96  0.81  1.09
#> 9_8            1.14  0.97  1.31
#> 9_9            1.10  0.99  1.23

Compare pointwise coverage of the 95% CrI calculated as HDI or equal-tailed. In this example they are nearly identical. The red line denote the HPD intervals, the gray line denotes the equal-tailed

matplot(impact_results$best$pred_quantiles_HDI[,c(2,3),1], type='l', col='gray', lty=c(2,1,2), bty='l', ylim=c(0, max(impact_results$best$pred_quantiles_HDI[,,1])), ylab='Count')

matplot(impact_results$best$pred_quantiles[,c(1,3),1], type='l', col='red', lty=c(2,1,2), bty='l', ylim=c(0, max(impact_results$best$pred_quantiles_HDI[,,1])), ylab='Count', add=T)
points(analysis$input_data[, analysis$outcome_name])